suppressPackageStartupMessages({
library(data.table)
library(DESeq2)
library(emoji)
library(gplots)
library(here)
library(hyperSpec)
library(parallel)
library(pander)
library(pheatmap)
library(plotly)
library(pvclust)
library(RColorBrewer)
library(scatterplot3d)
library(tidyverse)
library(tximport)
library(vsn)
})source(here("UPSCb-common/src/R/featureSelection.R"))pal <- brewer.pal(8,"Dark2")
hpal <- colorRampPalette(c("blue","white","red"))(100)
mar <- par("mar")samples <- read_delim(file = here("doc/Samples-swap-corrected.tsv"),
delim="\t",
col_types=cols(
SampleID=col_factor(),
Time=col_factor(),
Replicate=col_factor())
) %>%
mutate(Time=relevel(Time,"std"))filelist <- list.files(here("data/salmon"),
recursive = TRUE,
pattern = "quant.sf",
full.names = TRUE)Sort the raw data and samples
samples <- samples[match(basename(dirname(filelist)),samples$SampleID),]
stopifnot(all(samples$SampleID == basename(dirname(filelist))))name the file list vector
names(filelist) <- samples$SampleIDRead the expression at the transcript level (we have no gene information)
counts <- suppressMessages(round(tximport(files = filelist,
type = "salmon",txOut = TRUE)$counts))Read the algae IDs
IDs <- scan(here("data/analysis/annotation/algae-IDs.txt"),
what = "character")Subset the data
counts <- counts[IDs,]sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
round(sum(sel) * 100/ nrow(counts),digits=1),
sum(sel),
nrow(counts))## [1] "4.5% percent (2244) of 49477 genes are not expressed"
dat <- tibble(x=colnames(counts),y=colSums(counts)) %>%
bind_cols(samples)
levels = c("std", "60min", "4hrs", "12hrs", "24hrs", "72hrs", "120hrs")
ggplot(dat,aes(x,y,fill=factor(Time, levels))) +
geom_col() +
scale_y_continuous(name="reads") +
scale_fill_manual(values = pal) +
theme(axis.text.x=element_text(angle=90,size=10),
axis.title.x=element_blank()) +
labs(fill = "Time")i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.
The cumulative gene coverage is as expected
ggplot(data.frame(value=log10(rowMeans(counts))),aes(x=value)) +
geom_density(na.rm = TRUE) +
ggtitle("gene mean raw counts distribution") +
scale_x_continuous(name="mean raw counts (log10)")The same is done for the individual samples colored by Time
dat <- as.data.frame(log10(counts)) %>% utils::stack() %>%
mutate(Time=samples$Time[match(ind,samples$SampleID)])
dat$Time <- factor(dat$Time, levels)
ggplot(dat,aes(x=values,group=ind,col=Time)) +
geom_density(na.rm = TRUE) +
ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)") +
scale_color_manual(values = pal[1:7])dir.create(here("data/analysis/salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file=here("data/analysis/salmon/raw-unormalised-gene-expression_data.csv"))For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate.
dds <- DESeqDataSetFromMatrix(
countData = counts,
colData = samples,
design = ~ Time) %>%
suppressMessages()
save(dds,file=here("data/analysis/salmon/dds-sample-swap-corrected.rda"))Check the size factors (i.e. the sequencing library size effect)
dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
# pander(sizes)
boxplot(sizes, main="Sequencing libraries size factor",
las=2,log="y")
abline(h=1, col = "Red", lty = 3)vsd <- varianceStabilizingTransformation(dds, blind=FALSE)
vst <- assay(vsd)
vst <- vst - min(vst)The variance stabilisation worked adequately
meanSdPlot(log1p(counts(dds))[rowSums(counts(dds))>0,])meanSdPlot(vst[rowSums(vst)>0,])pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)We define the number of variable of the model
nvar=1An the number of possible combinations
nlevel=nlevels(dds$Time)We plot the percentage explained by the different components, the red line represent the number of variable in the model, the orange line the number of variable combinations.
ggplot(tibble(x=1:length(percent),y=cumsum(percent)),aes(x=x,y=y)) +
geom_line() + scale_y_continuous("variance explained (%)",limits=c(0,100)) +
scale_x_continuous("Principal component") +
geom_vline(xintercept=nvar,colour="red",linetype="dashed",linewidth=0.5) +
geom_hline(yintercept=cumsum(percent)[nvar],colour="red",linetype="dashed",size=0.5) +
geom_vline(xintercept=nlevel,colour="orange",linetype="dashed",linewidth=0.5) +
geom_hline(yintercept=cumsum(percent)[nlevel],colour="orange",linetype="dashed",size=0.5)## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
⭐ The PCA shows that a large fraction of the variance is explained by first 3 components.
pc.dat <- bind_cols(PC1=pc$x[,1],
PC2=pc$x[,2],
PC3=pc$x[,3],
samples)
#PC1 vs PC2
p1 <- pc.dat %>%
ggplot(mapping = aes(x=PC1,y=PC2,text=gsub(replacement = "", x = SampleID, pattern = "_B2_2"))) +
geom_point(size = 2) +
aes(color=Time) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")
p1 %<>% ggplotly() %>%
layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
yaxis=list(title=paste("PC2 (",percent[2],"%)",sep=""))) %>% suppressWarnings()
#PC1 vs PC3
p2 <- pc.dat %>%
ggplot(mapping = aes(x=PC1,y=PC3,text=gsub(replacement = "", x = SampleID, pattern = "_B2_2"))) +
geom_point(size = 2) +
aes(color=Time) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")
p2 %<>% ggplotly() %>%
layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
yaxis=list(title=paste("PC3 (",percent[3],"%)",sep=""))) %>% suppressWarnings()subplot(style(p1, showlegend = F), p2,
titleX = TRUE, titleY = TRUE, nrows = 1, margin = c(0.05, 0.05, 0, 0))Filter for noise
conds <- factor(dds$Time)
sels <- rangeFeatureSelect(counts=vst,
conditions=dds$Time,
nrep=3)vst.cutoff <- 2nn <- nrow(vst[sels[[vst.cutoff+1]],])
tn <- nrow(vst)
pn <- round(nn * 100/ tn, digits=1)⚠️ 24.4% (12071) of total 49477 genes are plotted below:
mat <- t(scale(t(vst[sels[[vst.cutoff+1]],])))
#annotation column
colnames(mat) <- gsub("B2_2_","",colnames(mat))
col_anno <- samples %>% select(Time) %>% as.data.frame()
col_anno$Time <- factor(col_anno$Time, levels)
rownames(col_anno) <- colnames(mat)
#annotation colors
col_anno_col = pal[1:7]
names(col_anno_col) <- levels
col_anno_col=list(Time = col_anno_col)
hm <- pheatmap(mat,
color = hpal,
border_color = NA,
clustering_distance_cols = "correlation",
clustering_distance_rows = "correlation",
clustering_method = "ward.D2",
annotation_col = col_anno,
show_rownames = F,
labels_col = conds,
angle_col = 90,
annotation_colors = col_anno_col,
legend = F)plot(as.hclust(hm$tree_col),xlab="",sub="")hm.pvclust <- suppressMessages(
pvclust(data = t(scale(t(vst[sels[[vst.cutoff]],]))),
method.hclust = "ward.D2",
nboot = 100, parallel = TRUE, quiet = T)
)
plot(hm.pvclust, labels = conds)
pvrect(hm.pvclust)print(hm.pvclust, digits=3)##
## Cluster method: ward.D2
## Distance : correlation
##
## Estimates on edges:
##
## si au bp se.si se.au se.bp v c pchi
## 1 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 2 0.917 0.948 0.987 0.170 0.120 0.009 -1.933 -0.305 0.796
## 3 0.956 0.976 0.987 0.072 0.045 0.006 -2.100 -0.122 0.885
## 4 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 5 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 6 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 7 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 8 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 9 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 10 0.810 0.911 0.885 0.084 0.050 0.011 -1.273 0.072 0.502
## 11 0.994 0.998 0.994 0.022 0.011 0.007 -2.651 0.163 0.349
## 12 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 13 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 14 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 15 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 16 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 17 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 18 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 19 0.854 0.898 0.990 0.266 0.210 0.007 -1.795 -0.525 0.988
## 20 0.772 0.836 0.984 0.253 0.209 0.007 -1.562 -0.586 0.981
## 21 0.772 0.836 0.984 0.253 0.209 0.007 -1.562 -0.586 0.981
## 22 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 23 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 24 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 25 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 26 0.860 0.919 0.964 0.112 0.078 0.008 -1.599 -0.202 0.827
## 27 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
⭐ The data looks very good after fixing the earlier samples swap. Both PCA and heatmap show separation between different times except for 72 and 120 hours which are mixed together. In PCA plots, the first component explains the majority of variation (27%) and separates samples at 1,4 and 12 hours from the rest (std, 24,72 and 120 hrs). The second component explains further variation (16%) separating early stages (std, 1 and 4 hours) from later stages. Together the PCA and Heatmap plots suggest that 4 and 12hrs conditions have the strongest effect. This is less in 60 min but still there is a clear effect. While 72hr and 120hr clustered separately from other times, they do not show much difference between themselves.
sessionInfo()## R version 4.2.2 (2022-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.6 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel grid stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] vsn_3.66.0 tximport_1.26.1
## [3] lubridate_1.9.2 forcats_1.0.0
## [5] stringr_1.5.0 dplyr_1.1.1
## [7] purrr_1.0.1 readr_2.1.4
## [9] tidyr_1.3.0 tibble_3.2.1
## [11] tidyverse_2.0.0 scatterplot3d_0.3-43
## [13] RColorBrewer_1.1-3 pvclust_2.2-0
## [15] plotly_4.10.1 pheatmap_1.0.12
## [17] pander_0.6.5 hyperSpec_0.100.0
## [19] xml2_1.3.3 ggplot2_3.4.2
## [21] lattice_0.20-45 here_1.0.1
## [23] gplots_3.1.3 emoji_15.0
## [25] DESeq2_1.38.3 SummarizedExperiment_1.28.0
## [27] Biobase_2.58.0 MatrixGenerics_1.10.0
## [29] matrixStats_0.63.0 GenomicRanges_1.50.2
## [31] GenomeInfoDb_1.34.9 IRanges_2.32.0
## [33] S4Vectors_0.36.2 BiocGenerics_0.44.0
## [35] data.table_1.14.8
##
## loaded via a namespace (and not attached):
## [1] colorspace_2.1-0 deldir_1.0-6 ellipsis_0.3.2
## [4] rprojroot_2.0.3 XVector_0.38.0 rstudioapi_0.14
## [7] hexbin_1.28.3 farver_2.1.1 affyio_1.68.0
## [10] bit64_4.0.5 AnnotationDbi_1.60.2 fansi_1.0.4
## [13] codetools_0.2-19 cachem_1.0.7 geneplotter_1.76.0
## [16] knitr_1.42 jsonlite_1.8.4 annotate_1.76.0
## [19] png_0.1-8 BiocManager_1.30.20 compiler_4.2.0
## [22] httr_1.4.5 Matrix_1.5-3 fastmap_1.1.1
## [25] lazyeval_0.2.2 limma_3.54.2 cli_3.6.1
## [28] htmltools_0.5.5 tools_4.2.0 gtable_0.3.3
## [31] glue_1.6.2 GenomeInfoDbData_1.2.9 affy_1.76.0
## [34] Rcpp_1.0.10 jquerylib_0.1.4 vctrs_0.6.1
## [37] Biostrings_2.66.0 preprocessCore_1.60.2 crosstalk_1.2.0
## [40] xfun_0.38 brio_1.1.3 testthat_3.1.7
## [43] timechange_0.2.0 lifecycle_1.0.3 gtools_3.9.4
## [46] XML_3.99-0.14 zlibbioc_1.44.0 scales_1.2.1
## [49] vroom_1.6.1 hms_1.1.3 yaml_2.3.7
## [52] memoise_2.0.1 sass_0.4.5 latticeExtra_0.6-30
## [55] stringi_1.7.12 RSQLite_2.3.1 highr_0.10
## [58] caTools_1.18.2 BiocParallel_1.32.6 rlang_1.1.0
## [61] pkgconfig_2.0.3 bitops_1.0-7 evaluate_0.20
## [64] labeling_0.4.2 htmlwidgets_1.6.2 bit_4.0.5
## [67] tidyselect_1.2.0 magrittr_2.0.3 R6_2.5.1
## [70] generics_0.1.3 DelayedArray_0.24.0 DBI_1.1.3
## [73] pillar_1.9.0 withr_2.5.0 KEGGREST_1.38.0
## [76] RCurl_1.98-1.12 crayon_1.5.2 interp_1.1-4
## [79] KernSmooth_2.23-20 utf8_1.2.3 tzdb_0.3.0
## [82] rmarkdown_2.21 jpeg_0.1-10 locfit_1.5-9.7
## [85] blob_1.2.4 digest_0.6.31 xtable_1.8-4
## [88] munsell_0.5.0 viridisLite_0.4.1 bslib_0.4.2
Created by Aman Zare